MVP

Question 1

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggfortify)
library(fastDummies)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(modelr)
## 
## Attaching package: 'modelr'
## The following object is masked from 'package:mosaic':
## 
##     resample
## The following object is masked from 'package:ggformula':
## 
##     na.warn
houses <- read_csv("data/kc_house_data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): id
## dbl  (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(houses)
##       id                 date                            price        
##  Length:21613       Min.   :2014-05-02 00:00:00.00   Min.   :  75000  
##  Class :character   1st Qu.:2014-07-22 00:00:00.00   1st Qu.: 321950  
##  Mode  :character   Median :2014-10-16 00:00:00.00   Median : 450000  
##                     Mean   :2014-10-29 04:38:01.96   Mean   : 540088  
##                     3rd Qu.:2015-02-17 00:00:00.00   3rd Qu.: 645000  
##                     Max.   :2015-05-27 00:00:00.00   Max.   :7700000  
##     bedrooms        bathrooms      sqft_living       sqft_lot      
##  Min.   : 0.000   Min.   :0.000   Min.   :  290   Min.   :    520  
##  1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427   1st Qu.:   5040  
##  Median : 3.000   Median :2.250   Median : 1910   Median :   7618  
##  Mean   : 3.371   Mean   :2.115   Mean   : 2080   Mean   :  15107  
##  3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10688  
##  Max.   :33.000   Max.   :8.000   Max.   :13540   Max.   :1651359  
##      floors        waterfront            view          condition    
##  Min.   :1.000   Min.   :0.000000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :1.500   Median :0.000000   Median :0.0000   Median :3.000  
##  Mean   :1.494   Mean   :0.007542   Mean   :0.2343   Mean   :3.409  
##  3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000  
##  Max.   :3.500   Max.   :1.000000   Max.   :4.0000   Max.   :5.000  
##      grade          sqft_above   sqft_basement       yr_built   
##  Min.   : 1.000   Min.   : 290   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.000   Median :1560   Median :   0.0   Median :1975  
##  Mean   : 7.657   Mean   :1788   Mean   : 291.5   Mean   :1971  
##  3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997  
##  Max.   :13.000   Max.   :9410   Max.   :4820.0   Max.   :2015  
##   yr_renovated       zipcode           lat             long       
##  Min.   :   0.0   Min.   :98001   Min.   :47.16   Min.   :-122.5  
##  1st Qu.:   0.0   1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3  
##  Median :   0.0   Median :98065   Median :47.57   Median :-122.2  
##  Mean   :  84.4   Mean   :98078   Mean   :47.56   Mean   :-122.2  
##  3rd Qu.:   0.0   3rd Qu.:98118   3rd Qu.:47.68   3rd Qu.:-122.1  
##  Max.   :2015.0   Max.   :98199   Max.   :47.78   Max.   :-121.3  
##  sqft_living15    sqft_lot15    
##  Min.   : 399   Min.   :   651  
##  1st Qu.:1490   1st Qu.:  5100  
##  Median :1840   Median :  7620  
##  Mean   :1987   Mean   : 12768  
##  3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :6210   Max.   :871200

No missing data

# Build the modelling data set: drop unused columns and recode the flag and
# ordinal columns into logical / factor types.
houses_tidy <- houses %>%
  # columns that are not needed for the models
  select(-c(date, id, sqft_living15, sqft_lot15, zipcode)) %>%
  # yr_renovated is only kept as a renovated yes/no flag
  rename(renovated = yr_renovated) %>%
  mutate(
    # 0/1 indicator -> logical
    waterfront = as.logical(waterfront),
    # a year of 0 becomes FALSE, any other value TRUE
    renovated = renovated != 0,
    # view and condition are categorical ordinal, so store them as factors
    view = factor(view, levels = 0:4),
    condition = factor(condition, levels = 1:5),
    # group grade into low (1-3), average (4-10) and high (11-13)
    grade = factor(
      case_when(
        grade > 10 ~ "high",
        grade < 4 ~ "low",
        TRUE ~ "average"
      ),
      levels = c("low", "average", "high")
    )
  )

Question 2

alias(lm(price ~ ., data = houses_tidy))
## Model :
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + condition + grade + sqft_above + sqft_basement + 
##     yr_built + renovated + lat + long
## 
## Complete :
##               (Intercept) bedrooms bathrooms sqft_living sqft_lot floors
## sqft_basement  0           0        0         1           0        0    
##               waterfrontTRUE view1 view2 view3 view4 condition2 condition3
## sqft_basement  0              0     0     0     0     0          0        
##               condition4 condition5 gradeaverage gradehigh sqft_above yr_built
## sqft_basement  0          0          0            0        -1          0      
##               renovatedTRUE lat long
## sqft_basement  0             0   0

alias has detected that sqft_basement can be calculated as sqft_living minus sqft_above. This means we can remove sqft_basement.

# sqft_basement is aliased (sqft_living - sqft_above), so drop it
houses_tidy <- select(houses_tidy, -sqft_basement)

Question 3

# Split the columns by type so each ggpairs() grid stays readable.
# select_if() is superseded in dplyr; select(where(...)) is the current idiom.
houses_tidy_numeric <- houses_tidy %>%
  select(where(is.numeric))

# Non-numeric columns, with price appended last so each can be compared with it.
houses_tidy_nonnumeric <- houses_tidy %>%
  select(!where(is.numeric), price)

ggpairs(houses_tidy_numeric, progress = FALSE)

Correlation with price - Numeric Variables

Strong positive correlations 1. sqft_living (0.702) 2. sqft_above (0.606)

Moderate correlation 3. bathrooms (0.525)

Weak correlations 4. bedrooms (0.308) 5. latitude (0.307) 6. floors (0.257)

Very weak correlations 7. sqft_lot (0.09) 8. yr_built (0.054) 9. longitude (0.022)

ggpairs(houses_tidy_nonnumeric, progress = FALSE)

Correlation with price - Non-Numeric Variables

waterfront - appears to affect price

View - there is a correlation with price, the median of the 5 levels generally increases with better views (less obvious between levels 1 and 2). The highest and lowest levels of view clearly have an effect on price.

condition - no obvious correlation here

grade - looks like a good correlation here with a decent increase in price related to the highest grade of building. This looks like the strongest correlation from the non-numeric variables.

renovated - may be a correlation but not a strong one from looking at boxplots

Conclusion: predictors to investigate, in order of potential: grade = “high”, waterfront = TRUE, view = 4 (or view = 0, I’m not sure).

First Predictor - sqft_living

# Model 1: living area as the single predictor of price
model1 <- lm(
  formula = price ~ sqft_living,
  data = houses_tidy
)

summary(model1)
## 
## Call:
## lm(formula = price ~ sqft_living, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1476062  -147486   -24043   106182  4362067 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43580.743   4402.690  -9.899   <2e-16 ***
## sqft_living    280.624      1.936 144.920   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 261500 on 21611 degrees of freedom
## Multiple R-squared:  0.4929, Adjusted R-squared:  0.4928 
## F-statistic: 2.1e+04 on 1 and 21611 DF,  p-value: < 2.2e-16
autoplot(model1)

plot(model1)

Not sure these diagnostic plots look great but have plotted the data below and I think the issue is caused by the shape of the data. It appears to have very defined cut-offs but I don’t know why?

# Raw scatter of price and living area, to see the shape behind the diagnostics.
# NOTE(review): the response (price) is mapped to x here, whereas the model is
# price ~ sqft_living; consider swapping the axes.
ggplot(houses, aes(x = price, y = sqft_living)) +
  geom_point(alpha = 0.1)

Conclusion: I have reservations about the shape of the data and consequently the diagnostic plots, but let’s go with this for now. R2 = 0.493, rse = 261500 (this is really quite high!), p-value < 0.01.

Second Predictor

The next strongest correlation for the numeric variables was sqft_above, but this is really very similar to sqft_living so let’s not do it next. Bathrooms has a moderate correlation with price so I should have tried that next, but in my exhausted delirium I added bedrooms by accident. I realised this when I came back to check things over, but when I changed bedrooms to bathrooms I found it didn’t improve the model at all, so maybe bedrooms is the better choice as 2nd predictor.

# Model 2: add bedrooms to living area
model2 <- lm(
  formula = price ~ sqft_living + bedrooms,
  data = houses_tidy
)

summary(model2)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1650867  -143866   -23143   102344  4179850 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  79469.359   6604.764   12.03   <2e-16 ***
## sqft_living    313.949      2.337  134.31   <2e-16 ***
## bedrooms    -57066.759   2308.223  -24.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 257800 on 21610 degrees of freedom
## Multiple R-squared:  0.5068, Adjusted R-squared:  0.5068 
## F-statistic: 1.11e+04 on 2 and 21610 DF,  p-value: < 2.2e-16

R2 = 0.507, so not much added from model 1 (R2 = 0.493); rse = 257800, still very high.

autoplot(model2)

plot(model2)

We seem to be fitting negative values - what is going on here?

# Scatter of price and bedroom count, to look for unusual bedroom values.
# NOTE(review): the response (price) is mapped to x here; consider swapping.
ggplot(houses, aes(x = price, y = bedrooms)) +
  geom_point()

So there is a house with over 30 bedrooms. This was investigated. This house is noted as having 33 bedrooms but costing $640000. All the other houses that cost this much (there were 21) had between 2 and 6 bedrooms, this is therefore assumed to be a typo and the number of bedrooms changed to 3.

# The 33-bedroom record is treated as a typo and set to 3 bedrooms
houses_tidy <- houses_tidy %>%
  mutate(bedrooms = replace(bedrooms, bedrooms > 30, 3))

Run model and diagnostic again…

# Model 2b: same predictors as model2, refitted after the bedrooms correction
model2b <- lm(
  formula = price ~ sqft_living + bedrooms,
  data = houses_tidy
)

summary(model2b)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1666625  -143358   -23058   102392  4163512 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  90059.793   6733.139   13.38   <2e-16 ***
## sqft_living    316.914      2.365  134.00   <2e-16 ***
## bedrooms    -62063.528   2392.305  -25.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 257500 on 21610 degrees of freedom
## Multiple R-squared:  0.5082, Adjusted R-squared:  0.5081 
## F-statistic: 1.116e+04 on 2 and 21610 DF,  p-value: < 2.2e-16

The R2 and rse are similar to model2 but the diagnostics should look better.

autoplot(model2b)

Hmmm, for plot 2 the residuals are getting larger as the prices get higher. For scale-location, the blue line is not flat and I think it should be. I am starting to have my doubts about using sqft_living; there is something not right with this data.

Does sqft_above look any better?

# Same scatter as for sqft_living, but using sqft_above.
# NOTE(review): the response (price) is mapped to x here; consider swapping.
ggplot(houses_tidy, aes(x = price, y = sqft_above)) +
  geom_point()

No the sqft_above data looks the same as sqft_living

What if we just try bedrooms on its own as the first predictor?

First predictor (again!)

# Model 1b: bedrooms alone as the first predictor
model1b <- lm(
  formula = price ~ bedrooms,
  data = houses_tidy
)

summary(model1b)
## 
## Call:
## lm(formula = price ~ bedrooms, data = houses_tidy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -993323 -203016  -65422  105984 6824400 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   110333       9108   12.11   <2e-16 ***
## bedrooms      127544       2610   48.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 348400 on 21611 degrees of freedom
## Multiple R-squared:  0.0995, Adjusted R-squared:  0.09946 
## F-statistic:  2388 on 1 and 21611 DF,  p-value: < 2.2e-16

R2 is low, bedrooms isn’t explaining much of the variation. I think we have to go with sqft_living.

Lets press on with model2b (price ~ sqft_living + bedrooms(with 33 altered))

Third Predictor

# Model 3: add waterfront as the third predictor
model3 <- lm(
  formula = price ~ sqft_living + bedrooms + waterfront,
  data = houses_tidy
)

summary(model3)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms + waterfront, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1547815  -139924   -20327   103317  4271814 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     83206.467   6497.811   12.80   <2e-16 ***
## sqft_living       304.569      2.302  132.29   <2e-16 ***
## bedrooms       -54179.832   2316.240  -23.39   <2e-16 ***
## waterfrontTRUE 790888.877  19706.699   40.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248400 on 21609 degrees of freedom
## Multiple R-squared:  0.5423, Adjusted R-squared:  0.5422 
## F-statistic:  8534 on 3 and 21609 DF,  p-value: < 2.2e-16

We’re up to R2 = 0.542 now, rse reducing slightly but still very high. p-values very low and diagnostic plots ok

autoplot(model3)

We’re up to R2 = 0.542 now, rse reducing slightly but still very high. p-values very low and diagnostic plots ok (ish) (I think??). Something odd has happened to the leverage graph.

Fourth Predictor

Lets look at the residuals to see what can best explain them.

# Attach model3's residuals (column `resid`) to look for the remaining
# variable that best explains them.
houses_resid <- houses_tidy %>%
  add_residuals(model3)

# select_if() is superseded in dplyr; select(where(...)) is the current idiom.
houses_tidy_numeric <- houses_resid %>%
  select(where(is.numeric))

# Non-numeric columns, with resid appended last so each can be compared with it.
houses_tidy_nonnumeric <- houses_resid %>%
  select(!where(is.numeric), resid)

ggpairs(houses_tidy_numeric, progress = FALSE)

Correlation with Residuals - Numeric Columns

latitude - weak positive (0.396), close to being moderate though; yr_built - weak negative (-0.240)

ggpairs(houses_tidy_nonnumeric, progress = FALSE)

Correlation with Residuals - Non-Numeric Columns

grade = “high” is showing a bit of correlation. None of the others are very convincing.

Add grade as 4th predictor

# Model 4: add grade as the fourth predictor
model4 <- lm(
  formula = price ~ sqft_living + bedrooms + waterfront + grade,
  data = houses_tidy
)

summary(model4)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms + waterfront + grade, 
##     data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1583426  -136185   -22248    99412  4189856 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     82809.000 120395.386   0.688    0.492    
## sqft_living       264.470      2.477 106.770  < 2e-16 ***
## bedrooms       -40777.877   2274.891 -17.925  < 2e-16 ***
## waterfrontTRUE 759150.255  19121.073  39.702  < 2e-16 ***
## gradeaverage    28275.650 120492.322   0.235    0.814    
## gradehigh      485138.645 121159.406   4.004 6.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 240800 on 21607 degrees of freedom
## Multiple R-squared:   0.57,  Adjusted R-squared:  0.5699 
## F-statistic:  5728 on 5 and 21607 DF,  p-value: < 2.2e-16

R2 is now 0.57 with rse 240800. grade = “high” has a p-value < 0.01, but grade = “average” has a high p-value and is not significant.

autoplot(model4)

plot(model4)

anova(model3, model4)

Adding grade is statistically significant

anova(model2b, model3)

Adding waterfront is statistically significant.

Third Predictor (again)

What if we try latitude as 3rd predictor?

# Model 3b: try latitude as the third predictor instead of waterfront
model3b <- lm(
  formula = price ~ sqft_living + bedrooms + lat,
  data = houses_tidy
)

summary(model3b)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1657228  -121478   -19654    80401  4191985 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.333e+07  5.584e+05  -59.69   <2e-16 ***
## sqft_living  3.073e+02  2.196e+00  139.93   <2e-16 ***
## bedrooms    -5.520e+04  2.219e+03  -24.88   <2e-16 ***
## lat          7.026e+05  1.174e+04   59.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 238500 on 21609 degrees of freedom
## Multiple R-squared:  0.5781, Adjusted R-squared:  0.578 
## F-statistic:  9870 on 3 and 21609 DF,  p-value: < 2.2e-16

Adding latitude as 3rd predictor takes us up to R2 = 0.5781

Fourth Predictor (again)

Lets add previous 3rd predictor in which was waterfront

# Model 4b: latitude model with waterfront added back in
model4b <- lm(
  formula = price ~ sqft_living + bedrooms + lat + waterfront,
  data = houses_tidy
)

summary(model4b)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat + waterfront, 
##     data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1534034  -116206   -16352    81061  4304633 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3.392e+07  5.338e+05  -63.55   <2e-16 ***
## sqft_living     2.944e+02  2.118e+00  138.96   <2e-16 ***
## bedrooms       -4.691e+04  2.128e+03  -22.04   <2e-16 ***
## lat             7.150e+05  1.122e+04   63.71   <2e-16 ***
## waterfrontTRUE  8.190e+05  1.809e+04   45.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 227900 on 21608 degrees of freedom
## Multiple R-squared:  0.6147, Adjusted R-squared:  0.6146 
## F-statistic:  8617 on 4 and 21608 DF,  p-value: < 2.2e-16

Cool, now we’re at R2 = 0.6147; rse is still horribly high; p-values are all very low.

autoplot(model4b)

Residuals

Lets look at the residuals again to see what can best explain them.

# Attach model4b's residuals (column `resid`) to look for the remaining
# variable that best explains them.
houses_resid <- houses_tidy %>%
  add_residuals(model4b)

# select_if() is superseded in dplyr; select(where(...)) is the current idiom.
houses_tidy_numeric <- houses_resid %>%
  select(where(is.numeric))

# Non-numeric columns, with resid appended last so each can be compared with it.
houses_tidy_nonnumeric <- houses_resid %>%
  select(!where(is.numeric), resid)

ggpairs(houses_tidy_numeric, progress = FALSE)

yr_built (-0.188) and longitude (-0.153) both have very weak negative correlations with the residuals

ggpairs(houses_tidy_nonnumeric, progress = FALSE)

Could try view? Grade doesn’t seem to make sense anymore as the lowest grade has a higher median than average and isn’t much different from the high level?

Seventh Predictor!!!

What happens if we add all our remaining potential predictors?? Woohoo, am going nuts!

# Model 5: model4b plus longitude, view and grade
model5 <- lm(
  formula = price ~ sqft_living + bedrooms + lat + waterfront + long + view + grade,
  data = houses_tidy
)

summary(model5)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat + waterfront + 
##     long + view + grade, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1762508  -104948   -11871    76237  4116519 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.696e+07  1.350e+06 -42.199   <2e-16 ***
## sqft_living     2.496e+02  2.324e+00 107.401   <2e-16 ***
## bedrooms       -3.027e+04  2.022e+03 -14.971   <2e-16 ***
## lat             6.761e+05  1.062e+04  63.667   <2e-16 ***
## waterfrontTRUE  4.949e+05  2.079e+04  23.803   <2e-16 ***
## long           -2.051e+05  1.090e+04 -18.814   <2e-16 ***
## view1           1.289e+05  1.187e+04  10.856   <2e-16 ***
## view2           9.913e+04  7.151e+03  13.863   <2e-16 ***
## view3           1.724e+05  9.787e+03  17.618   <2e-16 ***
## view4           3.335e+05  1.513e+04  22.037   <2e-16 ***
## gradeaverage   -1.755e+05  1.066e+05  -1.646   0.0997 .  
## gradehigh       2.463e+05  1.072e+05   2.298   0.0216 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 212800 on 21601 degrees of freedom
## Multiple R-squared:  0.6641, Adjusted R-squared:  0.664 
## F-statistic:  3883 on 11 and 21601 DF,  p-value: < 2.2e-16

Still only at R2 = 0.664 rse = 212800 so still very big

Neither grade level is strongly significant (gradeaverage p ≈ 0.10, gradehigh p ≈ 0.02), which isn’t that surprising. Let’s change it to condition.

# Model 5b: as model5, but with condition in place of grade
model5b <- lm(
  formula = price ~ sqft_living + bedrooms + lat + waterfront + long + view + condition,
  data = houses_tidy
)

summary(model5b)
## 
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat + waterfront + 
##     long + view + condition, data = houses_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1752924  -108306   -10720    82648  4144185 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.723e+07  1.391e+06 -41.150  < 2e-16 ***
## sqft_living     2.897e+02  2.196e+00 131.910  < 2e-16 ***
## bedrooms       -4.519e+04  2.058e+03 -21.953  < 2e-16 ***
## lat             6.847e+05  1.093e+04  62.629  < 2e-16 ***
## waterfrontTRUE  5.060e+05  2.137e+04  23.680  < 2e-16 ***
## long           -2.021e+05  1.124e+04 -17.980  < 2e-16 ***
## view1           1.236e+05  1.221e+04  10.127  < 2e-16 ***
## view2           9.910e+04  7.353e+03  13.477  < 2e-16 ***
## view3           1.742e+05  1.006e+04  17.311  < 2e-16 ***
## view4           3.451e+05  1.556e+04  22.184  < 2e-16 ***
## condition2      3.018e+04  4.329e+04   0.697  0.48569    
## condition3      2.548e+04  4.002e+04   0.637  0.52442    
## condition4      6.352e+04  4.008e+04   1.585  0.11302    
## condition5      1.089e+05  4.033e+04   2.700  0.00693 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 218700 on 21599 degrees of freedom
## Multiple R-squared:  0.6453, Adjusted R-squared:  0.6451 
## F-statistic:  3022 on 13 and 21599 DF,  p-value: < 2.2e-16

Only condition = 5 is statistically significant. I think this means I have to group them?

Might have to give up here and go to bed. Night night zzzzzzzzz